# Work from the project directory so the relative paths used below
# ("data.txt", "LDA_vis") resolve against it.
setwd("/home/timo/projects/patnlp/tasks/Teemade visualiseerija")

# 1. Read in the data.
# Same parsing as read.csv2 (semicolon-separated fields, decimal comma),
# written out explicitly. Quoting is disabled, so quote characters in the
# text are kept literally; as.is = TRUE keeps character columns as strings.
data <- read.table(
  file = "data.txt",
  header = TRUE,
  sep = ";",
  quote = "",
  dec = ",",
  fill = TRUE,
  comment.char = "",
  as.is = TRUE
)


# PRE-PROCESSING
# --------------
library(tm)

# Corpus with one document per element of data$tekst.
corpus <- Corpus(VectorSource(data$tekst))

## remove stopwords
#stopwords_est <- c("ja", "ning", "ega", "ehk", "kuid")
#corpus <- tm_map(corpus, removeWords, stopwords_est)

# Document-term matrix: numbers and punctuation are stripped and only terms
# of at least 3 characters are kept.
# The minimum length is given through `wordLengths = c(min, max)`; the older
# `minWordLength` option is not recognised by current tm and was silently
# ignored.
dtm <- DocumentTermMatrix(
  corpus,
  control = list(
    wordLengths = c(3, Inf),
    removeNumbers = TRUE,
    removePunctuation = TRUE
  )
)

# tfidf
library("slam")

# Per-term score: the mean, over the documents containing the term, of the
# term's share of that document's total count, multiplied by
# log2(number of documents / number of documents containing the term).
term_tfidf <-
  tapply(dtm$v / row_sums(dtm)[dtm$i], dtm$j, mean) *
  log2(nDocs(dtm) / col_sums(dtm > 0))

# Keep only the terms whose score is at or above the median score.
dtm <- dtm[, term_tfidf >= median(term_tfidf)]

# Drop documents left without any terms, and drop the same rows from `data`
# so that its rows stay aligned with the rows of `dtm`.
ind <- row_sums(dtm) > 0
dtm <- dtm[ind, ]
# drop = FALSE keeps `data` a data frame even if it has a single column;
# without it the subset collapses to a vector and data$tekst fails later.
data <- data[ind, , drop = FALSE]

# LDA
# ---
library(topicmodels)

# Model settings: number of topics and the seed passed in the control list.
k <- 50
SEED <- 2013

# Fit the topic model on the filtered document-term matrix.
TM <- LDA(x = dtm, k = k, control = list(seed = SEED))

## the most likely topic for each document
#Topic <- topics(TM, 1)

## ten top terms for each topic, shown for the first five topics
#Terms <- terms(TM, 10)
#Terms[, 1:5]

# Save the fitted model and the texts of the retained documents into the
# app directory.
data1 <- data$tekst
save(list = c("TM", "data1"), file = "LDA_vis/data_k50.RData")

# APP
# Start the Shiny application located in the "LDA_vis" directory
# (relative to the working directory).
library(shiny)
runApp(appDir = "LDA_vis")